import pyspark.sql.functions as F
from pyspark.sql.types import *
import json,boto3
from datetime import datetime,timedelta
import pandas as pd
%%local
import plotly.express as px
df_all = spark.read.option("header",True).csv("s3://mist-data-science-dev/may/roaming_exploration/daily_event_count")\
.filter(F.col("date")>='2021-02-09')\
.select("date",F.col("event_count").cast(LongType()),"event_type",F.col("count_distinct_wc").cast(LongType())) #F.to_date(F.col("date").cast(TimestampType())).alias("date")
+----------+-----------+-------------+-----------------+ | date|event_count| event_type|count_distinct_wc| +----------+-----------+-------------+-----------------+ |2021-02-16| 678542|sticky-client| 36110| |2021-02-09| 50159|sticky-client| 3156| |2021-02-18| 750229|sticky-client| 37703| |2021-02-10| 585622|sticky-client| 29740| |2021-02-13| 675713|sticky-client| 29465| |2021-02-14| 328811|sticky-client| 24395| |2021-02-15| 590071|sticky-client| 30906| |2021-02-17| 701604|sticky-client| 36645| |2021-02-20| 83597|sticky-client| 9848| |2021-02-19| 815166|sticky-client| 38330| |2021-02-12| 797668|sticky-client| 35167| |2021-02-11| 701694|sticky-client| 34820| |2021-02-16| 116597768| client-roam| 3332797| |2021-02-09| 42766885| client-roam| 2146350| |2021-02-18| 123070329| client-roam| 3576829| |2021-02-10| 138311387| client-roam| 4007565| |2021-02-13| 140362282| client-roam| 4133301| |2021-02-14| 123084502| client-roam| 3555420| |2021-02-15| 112711903| client-roam| 3202255| |2021-02-17| 123758882| client-roam| 3595755| +----------+-----------+-------------+-----------------+ only showing top 20 rows
%%spark -o df_all
%%local
# Event Comparison
px.bar(df_all,x="date", y="event_count", color="event_type", barmode="group")
%%local
# Event Count in log scale
px.bar(df_all,x="date", y="event_count", color="event_type", barmode="group",log_y=True)
%%local
#Count Distinct WC comparison
px.bar(df_all,x="date", y="count_distinct_wc", color="event_type", barmode="group")
%%local
# Average Event Per Client
df_all["avg_event_per_WC"] = df_all[["count_distinct_wc","event_count"]].apply(lambda x:x[1]/x[0],axis=1)
px.bar(df_all,x="date", y="avg_event_per_WC", color="event_type", barmode="group")
# Client Roam Raw Data
input_files = "s3://mist-secorapp-production/client-roam/client-roam-production/dt=2021-02-*/*/*.seq"
data_rdd = spark.sparkContext.sequenceFile(input_files).map(lambda r: json.loads(r[1]))
df_croam= spark.createDataFrame(data_rdd)
df_croam.printSchema()
df_croam.show(4)
root |-- AssocFail: boolean (nullable = true) |-- AssocFailReason: long (nullable = true) |-- AuthAlgo: long (nullable = true) |-- AuthFail: boolean (nullable = true) |-- AuthFailReason: long (nullable = true) |-- AuthType: string (nullable = true) |-- Band: string (nullable = true) |-- Capabilities: long (nullable = true) |-- FastRoamFail: boolean (nullable = true) |-- FastRoamFailCause: long (nullable = true) |-- FromAP: string (nullable = true) |-- OrgID: string (nullable = true) |-- PotentialCandidate: boolean (nullable = true) |-- PrevBand: string (nullable = true) |-- Roamtype: string (nullable = true) |-- SSID: string (nullable = true) |-- SinceAssoc: long (nullable = true) |-- SinceAuth: double (nullable = true) |-- SiteID: string (nullable = true) |-- State: boolean (nullable = true) |-- TimeDelta: long (nullable = true) |-- ToAP: string (nullable = true) |-- Version: long (nullable = true) |-- WC: string (nullable = true) |-- When: string (nullable = true) |-- WlanID: string (nullable = true) +---------+---------------+--------+--------+--------------+--------+----+------------+------------+-----------------+-----------------+--------------------+------------------+--------+---------+----------------+----------+-------------------+--------------------+-----+-----------+-----------------+-------+-----------------+--------------------+--------------------+ |AssocFail|AssocFailReason|AuthAlgo|AuthFail|AuthFailReason|AuthType|Band|Capabilities|FastRoamFail|FastRoamFailCause| FromAP| OrgID|PotentialCandidate|PrevBand| Roamtype| SSID|SinceAssoc| SinceAuth| SiteID|State| TimeDelta| ToAP|Version| WC| When| WlanID| +---------+---------------+--------+--------+--------------+--------+----+------------+------------+-----------------+-----------------+--------------------+------------------+--------+---------+----------------+----------+-------------------+--------------------+-----+-----------+-----------------+-------+-----------------+--------------------+--------------------+ | false| 0| 0| false| 0| open| 2.4| 0| false| 0|5c-5b-35-52-83-bb|bbb101eb-b62d-4fb...| false| 2.4|slow_roam| Walmartwifi_2.4| 0|9.223372036854776E9|72cbdd43-6b1e-4f8...| true| 8365155740|5c-5b-35-52-82-44| 1|02-d1-74-f6-97-53|2021-02-09T17:32:...|5f318cfe-b216-4b8...| | false| 0| 0| false| 0| open| 2.4| 0| false| 0|5c-5b-35-4e-7f-a3|bbb101eb-b62d-4fb...| false| 2.4|slow_roam| Walmartwifi_2.4| 0|9.223372036854776E9|1567d1de-2168-459...| true|13141419987|5c-5b-35-4e-86-42| 1|ee-cf-25-3b-04-a8|2021-02-09T17:32:...|5f318cfe-b216-4b8...| | false| 0| 0| false| 0| psk| 5| 0| false| 0|5c-5b-35-ae-f2-49|bbb101eb-b62d-4fb...| false| 5|slow_roam| MEviuPuoi311| null| 0.046012684|39e0a33a-a4de-447...| true| 1873985548|5c-5b-35-ae-f2-44| 1|94-fb-29-2e-e8-33|2021-02-09T17:32:...|6c6a5b39-dbf3-45e...| | false| 0| 0| false| 0| open| 2.4| 0| false| 0|5c-5b-35-af-32-9f|604411f1-4e45-4be...| false| 2.4|slow_roam|SamsClubWiFi_2.4| 0|9.223372036854776E9|fd3ac764-9f70-431...| true| 3304891311|5c-5b-35-af-31-e6| 1|b6-cf-f6-fa-9d-d1|2021-02-09T17:32:...|fa1b1bd8-b619-422...| +---------+---------------+--------+--------+--------------+--------+----+------------+------------+-----------------+-----------------+--------------------+------------------+--------+---------+----------------+----------+-------------------+--------------------+-----+-----------+-----------------+-------+-----------------+--------------------+--------------------+ only showing top 4 rows /usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/session.py:366: UserWarning: Using RDD of dict to inferSchema is deprecated. Use pyspark.sql.Row instead
df_croam.groupby("FastRoamFailCause").agg(F.count("WC").alias("event_count")).show(100)
+-----------------+-----------+ |FastRoamFailCause|event_count| +-----------------+-----------+ | 79| 258821| | 1| 18| | 53| 41438| +-----------------+-----------+
# get daily event count
def get_date(dt_in=None):
try:
temp = datetime.strptime(dt_in[0:19], '%Y-%m-%dT%H:%M:%S')
return str(temp.date())
except:
return None
get_dateUDF = F.udf(get_date,StringType())
temp = df_croam.filter(F.col("FastRoamFail")==True).withColumn("date",get_dateUDF("When")).groupby("date","Band").agg(F.count("WC").alias("event_count"))
temp.cache()
%%spark -o temp
%%local
px.bar(temp,x="date",y="event_count",color="Band",barmode="group",title = "Event Count of FastRoamFail.")
df_cr = spark.read.parquet("s3://mist-data-science-dev/may/roaming_exploration/client_roam_latency/").filter(F.col("Roamtype").isin("slow_roam","fast_roam"))
df_cr.show(5)
+---------+----+--------+------+----------+-----------------+-----------+------------------+--------------+-----------------+--------------------+----------------+---------------+ | Roamtype|Band|PrevBand|SameAP| date|count_Distinct_WC|event_count|median_RoamLatency|RoamLatency_75| RoamLatency_95| mean_RoamLatency| max_RoamLatency|min_RoamLatency| +---------+----+--------+------+----------+-----------------+-----------+------------------+--------------+-----------------+--------------------+----------------+---------------+ |slow_roam| 5| | false|2021-02-12| 3661| 24403| 0.199| 0.236| 0.268| 37.70490873485073| 125065.38| 0.0| |fast_roam| 5| 2.4| false|2021-02-15| 76761| 380754| 0.016| 0.022| 0.06| 32.19474991935524| 1503584.096| 0.005| |fast_roam| 2.4| 5| true|2021-02-14| 11659| 46353| 0.014| 0.024| 0.07|0.023915727133143213| 1.159| 0.005| |slow_roam| 5| 2.4| true|2021-02-11| 175063| 929922| 0.027| 0.206|42.15284999999986| 10677.346491596187|1.613080711166E9| 0.0| |slow_roam| 5| 2.4| true|2021-02-10| 178496| 942220| 0.027| 0.21| 41.312| 4151.777881738804| 2.9801002192E7| 0.0| +---------+----+--------+------+----------+-----------------+-----------+------------------+--------------+-----------------+--------------------+----------------+---------------+ only showing top 5 rows
%%spark -o df_cr
%%local
df_cr["Band_change"] = df_cr[["Band","PrevBand"]].apply(lambda x: "{} to {}".format(x[1],x[0]),axis=1)
#df_cr.head()
px.bar(df_cr[df_cr["SameAP"]==True].groupby(["date","Band_change","Roamtype"]).event_count.sum().reset_index(),x="date", y="event_count", color="Band_change", barmode="group",title="Client Roam to Same AP",hover_data=['Roamtype', 'event_count'])
# roam to same AP
df_croam.filter((F.col("FromAP")==F.col("ToAP")) & (F.col("Band")==F.col("PrevBand"))).show(5)
+---------+---------------+--------+--------+--------------+--------+----+------------+------------+-----------------+-----------------+--------------------+------------------+--------+---------+-----------+----------+-------------------+--------------------+-----+-----------+-----------------+-------+-----------------+--------------------+--------------------+ |AssocFail|AssocFailReason|AuthAlgo|AuthFail|AuthFailReason|AuthType|Band|Capabilities|FastRoamFail|FastRoamFailCause| FromAP| OrgID|PotentialCandidate|PrevBand| Roamtype| SSID|SinceAssoc| SinceAuth| SiteID|State| TimeDelta| ToAP|Version| WC| When| WlanID| +---------+---------------+--------+--------+--------------+--------+----+------------+------------+-----------------+-----------------+--------------------+------------------+--------+---------+-----------+----------+-------------------+--------------------+-----+-----------+-----------------+-------+-----------------+--------------------+--------------------+ | false| 0| 0| false| 0| open| 5| 0| false| 0|d4-20-b0-c0-ad-1b|b4f71ea9-1c74-4b6...| false| 5|slow_roam|DPS_Visitor| 0|9.223372036854776E9|8d365b0a-4333-449...| true| 1081451591|d4-20-b0-c0-ad-1b| 1|8a-cf-5e-0d-91-f5|2021-02-09T17:43:...|82e9e433-ccf1-442...| | false| 0| 2| false| 0| psk11r| 5| 128| false| 0|5c-5b-35-50-20-bc|5cc036bb-1f32-4f3...| false| 5|fast_roam| TVG| null| 0.007939465|d6ece5ff-f8cc-437...| true|32502450647|5c-5b-35-50-20-bc| 1|22-3d-fc-da-81-6a|2021-02-09T17:43:...|0b772fa9-84ec-438...| | false| 0| 0| false| 0| open| 5| 0| false| 0|d4-20-b0-c0-ad-1b|b4f71ea9-1c74-4b6...| false| 5|slow_roam|DPS_Visitor| 0|9.223372036854776E9|8d365b0a-4333-449...| true|11680282160|d4-20-b0-c0-ad-1b| 1|8a-cf-5e-0d-91-f5|2021-02-09T17:43:...|82e9e433-ccf1-442...| | false| 0| 0| false| 0| open| 5| 0| false| 0|d4-20-b0-c0-ad-75|b4f71ea9-1c74-4b6...| false| 5|slow_roam|DPS_Visitor| 0|9.223372036854776E9|8d365b0a-4333-449...| true| 4525518938|d4-20-b0-c0-ad-75| 1|8a-cf-5e-0d-91-f5|2021-02-09T17:44:...|82e9e433-ccf1-442...| | false| 0| 0| false| 0| psk| 5| 0| false| 0|5c-5b-35-d0-d4-bc|d6a1ec7f-a046-4d6...| false| 5|slow_roam| Walnut| null| 1.020448954|a40b5a6d-8d11-460...| true| 407402845|5c-5b-35-d0-d4-bc| 1|5e-a9-96-d1-63-66|2021-02-09T17:58:...|bb84c133-0b61-451...| +---------+---------------+--------+--------+--------------+--------+----+------------+------------+-----------------+-----------------+--------------------+------------------+--------+---------+-----------+----------+-------------------+--------------------+-----+-----------+-----------------+-------+-----------------+--------------------+--------------------+ only showing top 5 rows
%%local
df_cr["Band_change"] = df_cr[["Band","PrevBand"]].apply(lambda x: "{} to {}".format(x[1],x[0]),axis=1)
#df_cr.head()
px.bar(df_cr[df_cr["SameAP"]==False].groupby(["date","Band_change","Roamtype"]).event_count.sum().reset_index(),x="date", y="event_count", color="Band_change", barmode="group",title="Client Roam to Different AP",hover_data=['Roamtype', 'event_count'])
%%local
px.bar(df_cr.groupby(["Band_change","Roamtype","SameAP"]).RoamLatency_95.mean().reset_index(),x="Band_change", y="RoamLatency_95", color="SameAP", barmode="group",title="95 Percentile of Roaming Latency",hover_data=['Roamtype', 'SameAP'])
df_cr.filter(F.col("SameAP")==True).sort(F.desc("RoamLatency_95")).show()
+---------+----+--------+------+----------+-----------------+-----------+--------------------+-------------------+------------------+------------------+----------------+---------------+ | Roamtype|Band|PrevBand|SameAP| date|count_Distinct_WC|event_count| median_RoamLatency| RoamLatency_75| RoamLatency_95| mean_RoamLatency| max_RoamLatency|min_RoamLatency| +---------+----+--------+------+----------+-----------------+-----------+--------------------+-------------------+------------------+------------------+----------------+---------------+ |slow_roam| 2.4| 2.4| true|2021-02-10| 9| 10| 0.0155| 0.03775| 679.4622999999988|124.09969999999998| 1210.78| 0.008| |slow_roam| 5| 2.4| true|2021-02-14| 121114| 593416| 0.029| 0.447| 69.28895| 16581.88331065381|1.613340787062E9| 0.0| |slow_roam| 5| 2.4| true|2021-02-13| 139781| 691900| 0.028| 0.418| 65.40479999999982|16176.425693717007|1.613218396687E9| 0.0| |slow_roam| 5| 2.4| true|2021-02-19| 168808| 883373| 0.027| 0.302| 59.54759999999986|19216.169862918014|1.613771061446E9| 0.0| |slow_roam| 5| 2.4| true|2021-02-20| 6709| 11091| 0.028|0.40874999999999995| 56.41619999999999|10739.144987986632| 2.0163135987E7| 0.0| |slow_roam| 5| 2.4| true|2021-02-12| 171651| 894795| 0.027| 0.285|55.581099999999864| 18156.11396634966| 1.6131635232E9| 0.0| |slow_roam| 5| 2.4| true|2021-02-09| 84424| 302173| 0.027| 0.309| 49.70619999999999|3204.2414355861215| 2.1013274205E7| 0.0| |slow_roam| 5| 2.4| true|2021-02-15| 132240| 676711| 0.028| 0.338| 48.94514999999996| 10958.17332142937|1.613403951281E9| 0.0| |slow_roam| 5| 2.4| true|2021-02-16| 153133| 831324| 0.027| 0.236| 44.955| 6486.135178903683| 3.171263531E7| 0.0| |slow_roam| 5| 2.4| true|2021-02-17| 166072| 887891| 0.027| 0.227| 42.31314999999997| 8771.003002081856|1.613604858282E9| 0.0| |slow_roam| 5| 2.4| true|2021-02-11| 175063| 929922| 0.027| 0.206| 42.15284999999986|10677.346491596187|1.613080711166E9| 0.0| |slow_roam| 5| 2.4| true|2021-02-18| 164975| 880116| 0.027| 0.213| 42.09279999999999|25150.693631628008|1.613661160868E9| 0.0| |slow_roam| 5| 2.4| true|2021-02-10| 178496| 942220| 0.027| 0.21| 41.312| 4151.777881738804| 2.9801002192E7| 0.0| |slow_roam| 2.4| 2.4| true|2021-02-19| 12| 13| 0.029| 0.103|31.161599999999993| 4.875| 32.808| 0.01| |fast_roam| 2.4| 2.4| true|2021-02-09| 10| 10|0.013000000000000001| 22.52375| 30.03825| 9.0187| 30.045| 0.008| |slow_roam| 2.4| 2.4| true|2021-02-17| 14| 14| 0.062| 0.27475| 30.03725| 6.487142857142858| 30.073| 0.008| |slow_roam| 2.4| 2.4| true|2021-02-15| 9| 9| 0.017| 30.01| 30.0368|10.245444444444445| 30.046| 0.006| |slow_roam| 2.4| 2.4| true|2021-02-18| 13| 18| 0.024| 2.4044999999999996|30.025000000000002|4.0357777777777795| 30.11| 0.008| |slow_roam| 5| 5| true|2021-02-12| 465| 821| 0.127| 1.094|29.869799999999994|2055.5483465211464| 94989.49| 0.0| |slow_roam| 2.4| 2.4| true|2021-02-09| 3| 3| 8.78| 19.4115|27.916699999999995|12.946333333333333| 30.043| 0.016| +---------+----+--------+------+----------+-----------------+-----------+--------------------+-------------------+------------------+------------------+----------------+---------------+ only showing top 20 rows
# Suboptimal Roam Raw Data
input_files = "s3://mist-secorapp-production/client-suboptimal-roaming/client-suboptimal-roaming-production/dt=2021-02-*/*/*.seq"
data_rdd = spark.sparkContext.sequenceFile(input_files).map(lambda r: json.loads(r[1]))
df_suboptimal_roam= spark.createDataFrame(data_rdd)
df_suboptimal_roam.printSchema()
df_suboptimal_roam.filter(F.col("PrevRSSI")<0).show(5)
root |-- CurrentRSSI: long (nullable = true) |-- FromAP: string (nullable = true) |-- OrgID: string (nullable = true) |-- PrevRSSI: long (nullable = true) |-- SiteID: string (nullable = true) |-- ToAP: string (nullable = true) |-- Version: long (nullable = true) |-- WC: string (nullable = true) |-- When: string (nullable = true) |-- WlanID: string (nullable = true) +-----------+-----------------+--------------------+--------+--------------------+-----------------+-------+-----------------+--------------------+--------------------+ |CurrentRSSI| FromAP| OrgID|PrevRSSI| SiteID| ToAP|Version| WC| When| WlanID| +-----------+-----------------+--------------------+--------+--------------------+-----------------+-------+-----------------+--------------------+--------------------+ | -90|5c-5b-35-9f-47-12|bbb101eb-b62d-4fb...| -81|1c1d1ae6-3468-4a0...|5c-5b-35-9e-df-70| 1|82-be-c0-cf-ae-98|2021-01-31T23:29:...|28cac5b0-43ca-4ae...| | -76|d4-20-b0-40-15-4a|bbb101eb-b62d-4fb...| -65|64a5323e-e49e-480...|d4-20-b0-40-06-4f| 1|7e-4b-49-0c-78-1f|2021-01-31T23:30:...|5f318cfe-b216-4b8...| | -77|5c-5b-35-8e-76-52|a8c903ae-e6d0-42a...| -62|3eefeb57-47f4-4d5...|5c-5b-35-8e-76-7f| 1|06-48-3d-ad-72-f9|2021-02-01T00:00:...|b29415a8-7155-402...| | -73|d4-20-b0-c0-90-ce|bbb101eb-b62d-4fb...| -65|d3d1e792-f94a-476...|d4-20-b0-c1-1b-b6| 1|36-88-3c-36-68-24|2021-01-31T23:30:...|28cac5b0-43ca-4ae...| | -73|5c-5b-35-bf-04-73|604411f1-4e45-4be...| -57|6eb59d1f-a52d-492...|5c-5b-35-bf-03-c4| 1|0e-fd-0d-63-d6-15|2021-01-31T23:30:...|fa1b1bd8-b619-422...| +-----------+-----------------+--------------------+--------+--------------------+-----------------+-------+-----------------+--------------------+--------------------+ only showing top 5 rows
# top 10 orgid for suboptimal roam
df_suboptimal_roam.groupby("OrgID").agg(F.count("WC").alias("event_count")).sort(F.desc("event_count")).show(10,truncate=False)
+------------------------------------+-----------+ |OrgID |event_count| +------------------------------------+-----------+ |bbb101eb-b62d-4fb1-8c3d-030c6db7e208|2625234 | |604411f1-4e45-4bed-9a69-cc37b247fdf9|589738 | |0f2adb1c-8843-448f-8753-91a3d2fdbd14|236346 | |c1cac1c4-1753-4dde-a065-e17a1c305c2d|152458 | |d9ca39f2-a2cd-40af-b095-eecfbeeb8a12|112955 | |d8cec22e-e0c2-11e5-8d0f-02e208b2d34f|86866 | |56de201d-e63b-4312-9858-40f4cfe35c7f|57721 | |39b04356-4cc3-41cc-bc1a-5de54beb1941|53208 | |22f1cc2d-ea8a-47ea-b4c0-689a86a0bedf|41969 | |e1bde705-bd39-4089-8e3d-b897d0b660fa|37334 | +------------------------------------+-----------+ only showing top 10 rows
df_sr= spark.read.parquet("s3://mist-data-science-dev/may/roaming_exploration/total_suboptimal_roam_rssi_diff/")
df_sr.show()
+------+-----------------+-----------+----------------+------------+------------+-------------------+-------------+-------------+ |SameAP|count_distinct_wc|event_count|median_RSSI_diff|RSSI_diff_75|RSSI_diff_95| mean_RSSI_diff|max_RSSI_diff|min_RSSI_diff| +------+-----------------+-----------+----------------+------------+------------+-------------------+-------------+-------------+ | true| 64646| 282780| -12.0| -9.0| -7.0|-13.157627837895184| -7| -86| | false| 457452| 945583| -12.0| -9.0| -7.0| -13.82344648751088| -7| -91| +------+-----------------+-----------+----------------+------------+------------+-------------------+-------------+-------------+
df_sr = spark.read.parquet("s3://mist-data-science-dev/may/roaming_exploration/suboptimal_roam_rssi_diff/")
df_sr.show(5)
+----------+------+-----------------+-----------+----------------+------------+------------+-------------------+-------------+-------------+ | date|SameAP|count_distinct_wc|event_count|median_RSSI_diff|RSSI_diff_75|RSSI_diff_95| mean_RSSI_diff|max_RSSI_diff|min_RSSI_diff| +----------+------+-----------------+-----------+----------------+------------+------------+-------------------+-------------+-------------+ |2021-02-11| true| 6333| 15193| -11.0| -9.0| -7.0|-12.932666359507667| -7| -53| |2021-02-10| true| 6827| 16667| -12.0| -9.0| -7.0|-13.074458510829784| -7| -58| |2021-02-02| false| 29072| 38355| -12.0| -9.0| -7.0| -13.96735758049798| -7| -90| |2021-02-16| false| 22401| 28409| -12.0| -9.0| -7.0|-14.056284980111936| -7| -74| |2021-02-14| true| 4058| 11907| -12.0| -9.0| -7.0|-13.271604938271604| -7| -51| +----------+------+-----------------+-----------+----------------+------------+------------+-------------------+-------------+-------------+ only showing top 5 rows
%%spark -o df_sr
%%local
# suboptimal event comp betwee same and different AP.
px.bar(df_sr.groupby(["date","SameAP"]).event_count.sum().reset_index(),x="date",y="event_count",color="SameAP",barmode="group",title="Suboptimal Event Count to Same AP vs Different AP")
%%local
px.histogram(df_sr, x="mean_RSSI_diff")
#df_sr["mean_RSSI_diff"].hist()
#ping Pong roam raw data
input_files = "s3://mist-secorapp-production/client-pingpong-roaming/client-pingpong-roaming-production/dt=2021-02*/*/*.seq"
data_rdd = spark.sparkContext.sequenceFile(input_files).map(lambda r: json.loads(r[1]))
df_pp= spark.createDataFrame(data_rdd)
df_pp.printSchema()
df_pp.show(5)
root |-- FromAP: string (nullable = true) |-- NumRoams: long (nullable = true) |-- OrgID: string (nullable = true) |-- SiteID: string (nullable = true) |-- ToAP: string (nullable = true) |-- Version: long (nullable = true) |-- WC: string (nullable = true) |-- When: string (nullable = true) |-- Window: long (nullable = true) |-- WlanID: string (nullable = true) +-----------------+--------+--------------------+--------------------+-----------------+-------+-----------------+--------------------+------------+--------------------+ | FromAP|NumRoams| OrgID| SiteID| ToAP|Version| WC| When| Window| WlanID| +-----------------+--------+--------------------+--------------------+-----------------+-------+-----------------+--------------------+------------+--------------------+ |5c-5b-35-d2-4b-5d| 3|bbb101eb-b62d-4fb...|94e4e435-136b-49c...|d4-20-b0-c2-1b-06| 1|c6-4b-52-48-97-9c|2021-02-01T00:00:...|-10034829438|28cac5b0-43ca-4ae...| |5c-5b-35-d1-01-ad| 3|bbb101eb-b62d-4fb...|9b1b9cfc-a561-43d...|5c-5b-35-d1-01-4e| 1|3a-f6-5a-87-4f-b5|2021-02-01T00:00:...|-19738379302|5f318cfe-b216-4b8...| |5c-5b-35-af-1f-e9| 3|bbb101eb-b62d-4fb...|08216ecc-4dfa-494...|5c-5b-35-af-1e-22| 1|0c-2f-b0-c6-44-00|2021-02-01T00:08:...| -2592395537|5f318cfe-b216-4b8...| |d4-20-b0-81-47-e4| 3|d8cec22e-e0c2-11e...|d6d033b3-1274-48c...|d4-20-b0-81-46-b3| 1|8c-c8-4b-b1-13-9b|2021-02-01T00:11:...| -649432157|3d393407-d3db-42e...| |5c-5b-35-ae-b9-46| 3|604411f1-4e45-4be...|6c7657a9-8dc5-412...|5c-5b-35-ae-c2-4c| 1|70-ce-8c-a2-ce-c2|2021-02-01T00:21:...| -701470121|fa1b1bd8-b619-422...| +-----------------+--------+--------------------+--------------------+-----------------+-------+-----------------+--------------------+------------+--------------------+ only showing top 5 rows
# top 10 ping pong roam orgid
df_pp.groupby("OrgID").agg(F.count("WC").alias("event_count")).sort(F.desc("event_count")).show(10,truncate=False)
+------------------------------------+-----------+ |OrgID |event_count| +------------------------------------+-----------+ |bbb101eb-b62d-4fb1-8c3d-030c6db7e208|29383 | |52ca4540-62b5-4ed1-89a3-6ab60647c7dd|4910 | |d9ca39f2-a2cd-40af-b095-eecfbeeb8a12|4722 | |56de201d-e63b-4312-9858-40f4cfe35c7f|4411 | |2eed5700-cb1e-4256-8aa3-73cb580af144|3108 | |604411f1-4e45-4bed-9a69-cc37b247fdf9|2950 | |d8cec22e-e0c2-11e5-8d0f-02e208b2d34f|2674 | |790be815-af8f-4dc5-b8c2-e01b8a107db4|1886 | |3098d079-88b7-4383-91fd-5eb46a101365|1527 | |5a578f3c-f094-4d13-a7dd-96f0aaecb5b6|1173 | +------------------------------------+-----------+ only showing top 10 rows
df_pp.groupby("NumRoams").agg(F.count("WC")).show(10)
+--------+---------+ |NumRoams|count(WC)| +--------+---------+ | 3| 68387| +--------+---------+
df_pp = spark.read.parquet("s3://mist-data-science-dev/may/roaming_exploration/pingpong_roam_agg/").filter(F.col("IsPingPong")==True).withColumn("day",F.date_format("date", 'E'))
df_pp.show(5)
+----------+------+----------+-----------+------------------+--------------------------+----------------------+----------------------+------------------------+-----------------------+-----------------------+---+ | date|SameAP|IsPingPong|event_count|count_distinct_mac|median_pingpong_duration_s|pingpong_duration_s_75|pingpong_duration_s_95|mean_pingpong_duration_s|max_pingpong_duration_s|min_pingpong_duration_s|day| +----------+------+----------+-----------+------------------+--------------------------+----------------------+----------------------+------------------------+-----------------------+-----------------------+---+ |2021-02-12| true| true| 251| 214| 8.796026571| 24.950885563| 75.33606464600001| 19.346143137553785| 114.748611639| 0.006200507|Fri| |2021-02-16| true| true| 67| 62| 8.749670674| 25.993954756| 65.59251198349997| 19.050224249925375| 94.77926207| 0.028822597|Tue| |2021-02-08| true| true| 251| 216| 10.117249503| 20.318475960500002| 70.05992217400001| 17.57307894453386| 107.818753423| 0.248847088|Mon| |2021-02-11| false| true| 3384| 2833| 6.26386957| 17.5828943265| 61.77535126550001| 14.242095053413708| 119.219198043| 0.0|Thu| |2021-02-17| true| true| 100| 74| 12.2038319165| 39.97108211525| 90.68115202579999| 24.836056458410003| 102.744723159| 0.063464121|Wed| +----------+------+----------+-----------+------------------+--------------------------+----------------------+----------------------+------------------------+-----------------------+-----------------------+---+ only showing top 5 rows
%%spark -o df_pp
%%local
px.bar(df_pp.groupby(["date","SameAP"]).event_count.sum().reset_index(),x="date", y="event_count", color="SameAP", barmode="group")
%%local
# Pingpong duration
px.bar(df_pp.groupby(["date","SameAP"]).median_pingpong_duration_s.sum().reset_index(),x="date", y="median_pingpong_duration_s", color="SameAP", barmode="group")
# Sticky Client Raw Data
input_files = "s3://mist-secorapp-production/sticky-client/sticky-client-production/dt=2021-02-10/*/*.seq"
data_rdd = spark.sparkContext.sequenceFile(input_files).map(lambda r: json.loads(r[1])).filter(lambda r: r["Sticky"]==True)
df_sc= spark.createDataFrame(data_rdd)
df_sc.printSchema()
df_sc.show()
print(df_sc.limit(1).collect())
root
|-- Assoc: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
|-- RoamCandidates: array (nullable = true)
| |-- element: map (containsNull = true)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
|-- Sticky: boolean (nullable = true)
|-- Version: long (nullable = true)
|-- WC: string (nullable = true)
|-- When: string (nullable = true)
+--------------------+--------------------+------+-------+-----------------+--------------------+
| Assoc| RoamCandidates|Sticky|Version| WC| When|
+--------------------+--------------------+------+-------+-----------------+--------------------+
|[OrgID -> 4f7ce01...|[[RSSI -> -72, Ba...| true| 3|e2-9e-cb-1b-1a-a8|2021-02-10T00:45:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -67, Ba...| true| 3|ca-47-45-d4-5c-a7|2021-02-10T00:45:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -58, Ba...| true| 3|c0-84-7d-20-d6-ac|2021-02-10T00:45:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -53, Ba...| true| 3|ee-56-ae-ba-96-eb|2021-02-10T00:45:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -64, Ba...| true| 3|52-99-69-45-97-78|2021-02-10T00:45:...|
|[OrgID -> 0992350...|[[RSSI -> -62, Ba...| true| 3|f8-ff-c2-4e-24-d0|2021-02-10T00:45:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -64, Ba...| true| 3|c0-84-7d-c9-45-2e|2021-02-10T00:45:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -44, Ba...| true| 3|8c-c6-81-a1-19-d1|2021-02-10T00:45:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -61, Ba...| true| 3|da-33-cb-c3-96-12|2021-02-10T00:45:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -67, Ba...| true| 3|56-51-63-1f-f9-15|2021-02-10T00:45:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -70, Ba...| true| 3|ca-c5-e0-62-9c-56|2021-02-10T00:45:...|
|[OrgID -> a7cc116...|[[RSSI -> -60, Ba...| true| 3|2a-07-4a-f3-29-8e|2021-02-10T00:45:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -64, Ba...| true| 3|b2-a9-4a-7a-16-a5|2021-02-10T00:46:...|
|[OrgID -> 2cd327d...|[[RSSI -> -71, Ba...| true| 3|98-6c-f5-56-af-ba|2021-02-10T00:46:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -65, Ba...| true| 3|9e-d4-74-46-eb-7e|2021-02-10T00:46:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -70, Ba...| true| 3|ca-c5-e0-62-9c-56|2021-02-10T00:46:...|
|[OrgID -> 4f7ce01...|[[RSSI -> -64, Ba...| true| 3|26-c2-01-41-d6-65|2021-02-10T00:46:...|
|[OrgID -> 313cd17...|[[RSSI -> -63, Ba...| true| 3|00-14-a5-ca-19-a8|2021-02-10T00:46:...|
|[OrgID -> a15b4c5...|[[RSSI -> -64, Ba...| true| 3|5c-1d-d9-0d-a9-cc|2021-02-10T00:46:...|
|[OrgID -> 0992350...|[[RSSI -> -67, Ba...| true| 3|00-f6-20-57-c6-cf|2021-02-10T00:46:...|
+--------------------+--------------------+------+-------+-----------------+--------------------+
only showing top 20 rows
[Row(Assoc={'OrgID': '4f7ce014-2148-4d41-88f0-07c9c2e6f16c', 'SiteID': 'c8b285e1-bb09-4fa1-8e71-2c2e865cee2b', 'BSSID': '5c-5b-35-e1-41-c3', 'Band': '5', 'WLAN': '1fc8eb8a-56b0-4b9e-b2ae-2391d2be4eb8', 'AvgRSSI': '-77', 'SSID': 'iSPOTA1', 'AP': '5c-5b-35-d0-46-b9'}, RoamCandidates=[{'RSSI': '-72', 'Band': '5', 'BSSID': '5c-5b-35-e1-09-23', 'AP': '5c-5b-35-d0-40-d3'}], Sticky=True, Version=3, WC='e2-9e-cb-1b-1a-a8', When='2021-02-10T00:45:48.484555937Z')]
df_sc = spark.read.parquet("s3://mist-data-science-dev/may/roaming_exploration/sticky_client_flatten/").withColumn("RSSI_diff",F.col("CurrentRSSI")-F.col("Candidate_RSSI"))
df_sc.show()
+-----------------+--------------------+--------------------+----+-----------+-----------------+--------------+--------------+-----------------+--------------------+----------+---------+ | WC| OrgID| SiteID|Band|CurrentRSSI| currentAP|Candidate_Band|Candidate_RSSI| Candidate_AP| When| date|RSSI_diff| +-----------------+--------------------+--------------------+----+-----------+-----------------+--------------+--------------+-----------------+--------------------+----------+---------+ |0a-5d-01-de-34-dd|0992350f-e897-471...|6532c75a-c109-4f8...| 5| -73|5c-5b-35-d3-f3-95| 5| -55|5c-5b-35-d3-f2-37|2021-02-09T22:59:...|2021-02-09| -18.0| |0a-5d-01-de-34-dd|0992350f-e897-471...|6532c75a-c109-4f8...| 5| -73|5c-5b-35-d3-f3-95| 2.4| -55|5c-5b-35-d3-f2-37|2021-02-09T22:59:...|2021-02-09| -18.0| |0a-5d-01-de-34-dd|0992350f-e897-471...|6532c75a-c109-4f8...| 5| -73|5c-5b-35-d3-f3-95| 5| -68|d4-20-b0-c0-30-1b|2021-02-09T22:59:...|2021-02-09| -5.0| |0a-5d-01-de-34-dd|0992350f-e897-471...|6532c75a-c109-4f8...| 5| -73|5c-5b-35-d3-f3-95| 2.4| -68|d4-20-b0-c0-30-1b|2021-02-09T22:59:...|2021-02-09| -5.0| |10-2c-6b-0c-47-14|4f7ce014-2148-4d4...|aba829e6-e38b-4b8...| 5| -74|5c-5b-35-d0-3e-cb| 5| -55|5c-5b-35-d0-3e-53|2021-02-09T23:00:...|2021-02-09| -19.0| |10-2c-6b-0c-47-14|4f7ce014-2148-4d4...|aba829e6-e38b-4b8...| 5| -74|5c-5b-35-d0-3e-cb| 2.4| -55|5c-5b-35-d0-3e-53|2021-02-09T23:00:...|2021-02-09| -19.0| |10-2c-6b-1a-4d-52|4f7ce014-2148-4d4...|cd915440-6c9f-42c...| 5| -74|5c-5b-35-d0-34-53| 5| -70|5c-5b-35-d1-94-56|2021-02-09T23:00:...|2021-02-09| -4.0| |10-2c-6b-1a-4d-52|4f7ce014-2148-4d4...|cd915440-6c9f-42c...| 5| -74|5c-5b-35-d0-34-53| 2.4| -70|5c-5b-35-d1-94-56|2021-02-09T23:00:...|2021-02-09| -4.0| |42-b2-7f-66-65-ab|4f7ce014-2148-4d4...|2b691141-0344-47f...| 5| -81|5c-5b-35-53-a7-a5| 5| -57|5c-5b-35-53-a0-c0|2021-02-09T23:00:...|2021-02-09| -24.0| |10-2c-6b-0c-42-f6|4f7ce014-2148-4d4...|04ba6f1e-e04f-4e8...| 5| -73|5c-5b-35-d0-3e-6c| 5| -46|5c-5b-35-d1-94-8d|2021-02-09T23:00:...|2021-02-09| -27.0| |10-2c-6b-0c-42-f6|4f7ce014-2148-4d4...|04ba6f1e-e04f-4e8...| 5| -73|5c-5b-35-d0-3e-6c| 2.4| -46|5c-5b-35-d1-94-8d|2021-02-09T23:00:...|2021-02-09| -27.0| |10-2c-6b-0c-42-f6|4f7ce014-2148-4d4...|04ba6f1e-e04f-4e8...| 5| -73|5c-5b-35-d0-3e-6c| 5| -59|5c-5b-35-d1-94-6a|2021-02-09T23:00:...|2021-02-09| -14.0| |10-2c-6b-0c-42-f6|4f7ce014-2148-4d4...|04ba6f1e-e04f-4e8...| 5| -73|5c-5b-35-d0-3e-6c| 2.4| -59|5c-5b-35-d1-94-6a|2021-02-09T23:00:...|2021-02-09| -14.0| |10-2c-6b-0c-42-f6|4f7ce014-2148-4d4...|04ba6f1e-e04f-4e8...| 5| -73|5c-5b-35-d0-3e-6c| 5| -65|5c-5b-35-d1-94-47|2021-02-09T23:00:...|2021-02-09| -8.0| |10-2c-6b-0c-42-f6|4f7ce014-2148-4d4...|04ba6f1e-e04f-4e8...| 5| -73|5c-5b-35-d0-3e-6c| 2.4| -65|5c-5b-35-d1-94-47|2021-02-09T23:00:...|2021-02-09| -8.0| |c8-f3-19-58-64-5c|9de2eacf-bc8a-46c...|7d5c97be-c8c2-4a2...| 5| -74|5c-5b-35-cf-62-51| 2.4| -67|5c-5b-35-cf-2f-f7|2021-02-09T23:00:...|2021-02-09| -7.0| |c8-f3-19-58-64-5c|9de2eacf-bc8a-46c...|7d5c97be-c8c2-4a2...| 5| -74|5c-5b-35-cf-62-51| 5| -67|5c-5b-35-cf-2f-f7|2021-02-09T23:00:...|2021-02-09| -7.0| |10-2c-6b-0b-71-c0|4f7ce014-2148-4d4...|bff6e605-1240-4d5...| 5| -80|d4-20-b0-41-5b-da| 5| -69|d4-20-b0-41-5c-e8|2021-02-09T23:00:...|2021-02-09| -11.0| |10-2c-6b-0b-71-c0|4f7ce014-2148-4d4...|bff6e605-1240-4d5...| 5| -80|d4-20-b0-41-5b-da| 2.4| -69|d4-20-b0-41-5c-e8|2021-02-09T23:00:...|2021-02-09| -11.0| |c0-84-7d-c9-0e-1c|4f7ce014-2148-4d4...|9d4f323a-9e5a-4b6...| 2.4| -77|5c-5b-35-d1-92-fd| 5| -71|5c-5b-35-d0-3e-1c|2021-02-09T23:00:...|2021-02-09| -6.0| +-----------------+--------------------+--------------------+----+-----------+-----------------+--------------+--------------+-----------------+--------------------+----------+---------+ only showing top 20 rows
# top 10 orgID for sticky clients
df_sc.groupby("OrgID").agg(F.count("WC").alias("event_count"),F.countDistinct("WC").alias("count_distinct_WC")).withColumn("avg_event_per_WC", F.col("event_count")/F.col("count_distinct_WC")).sort(F.desc("event_count")).show(20,truncate=False)
+------------------------------------+-----------+-----------------+------------------+ |OrgID |event_count|count_distinct_WC|avg_event_per_WC | +------------------------------------+-----------+-----------------+------------------+ |4f7ce014-2148-4d41-88f0-07c9c2e6f16c|11166654 |29319 |380.86749206998877| |0992350f-e897-4719-8671-010a7e4ebf9c|4709050 |29471 |159.7858912150928 | |a15b4c50-378e-4103-974e-ea9014cd281b|1082509 |4640 |233.29935344827587| |758a1f7f-6d65-4d85-90b2-0775b8b476bc|579366 |5169 |112.08473592571097| |9de2eacf-bc8a-46cc-ad0e-e81780f1095b|552072 |13666 |40.39748280403922 | |2cd327d0-d323-4bef-b455-9dca175026b2|501272 |554 |904.8231046931407 | |20101562-d4f8-41aa-aaa9-98349d78dfa0|397515 |15076 |26.367405147253912| |b2e5fc49-d460-42b9-893c-27dbaff943aa|328867 |5677 |57.92971639950678 | |4a7f417d-a4b1-4452-804e-589a29dff85d|201088 |1965 |102.33486005089058| |a5f0b940-965c-4867-ab7c-ba93609872ab|199943 |1367 |146.2640819312363 | |5f71b73b-532f-4c07-b6c1-22cff8942b35|185413 |187 |991.5133689839572 | |7e37ab7a-f873-4247-aa57-d0cc22ad95ef|175999 |6902 |25.499710228919152| |fc35aa7d-7e64-4083-8aed-45730fdd7b43|125508 |430 |291.8790697674419 | |eff7b071-6671-4eb8-94b7-2a20303cf1b4|124537 |2110 |59.02227488151659 | |be47404e-6e38-4546-b93c-00c083622e75|105594 |1406 |75.10241820768137 | |313cd174-b2e1-40cf-8908-b4ba11a7c85f|68939 |891 |77.37261503928171 | |124630b3-a36e-423d-b7eb-e1af323eb906|61618 |632 |97.49683544303798 | |d81f3b9a-c090-499a-9fda-74c13d892a85|48294 |1755 |27.51794871794872 | |790be815-af8f-4dc5-b8c2-e01b8a107db4|39941 |838 |47.66229116945107 | |cd38e94a-9b9c-11e6-9e83-02e208b2d34f|34474 |3206 |10.75296319401123 | +------------------------------------+-----------+-----------------+------------------+ only showing top 20 rows
# count of sticky client per band
sticky_client_count = df_sc.groupby("When","WC","date","Band").agg(F.count("Candidate_AP").alias("RoamCandidate_count"))
sticky_client_count.groupby("RoamCandidate_count").agg(F.count("WC").alias("event_count")).sort(F.desc("event_count")).show(500)
# histogram of roamcandidate_count
+-------------------+-----------+ |RoamCandidate_count|event_count| +-------------------+-----------+ | 2| 5390022| | 1| 1330850| | 4| 885690| | 6| 319467| | 8| 152785| | 3| 125902| | 10| 48368| | 5| 43468| | 16| 40714| | 12| 28312| | 7| 23468| | 14| 17687| | 9| 1864| | 11| 1392| | 13| 1083| | 15| 261| +-------------------+-----------+
df_sc.groupby("Band").agg(F.count("WC").alias("event_count")).show()
temp = df_sc.groupby("date","Band").agg(F.count("WC").alias("event_count"))
+----+-----------+ |Band|event_count| +----+-----------+ | 2.4| 3387450| | | 74| | 5| 17937082| +----+-----------+
%%spark -o temp
%%local
## sticky client event count per band
temp["Band"] = temp["Band"].astype(str)
px.bar(temp,x="date",y="event_count",color="Band",barmode="group")
%%spark -o df_sc
%%local
#Sticky Client Current RSSI Histogram
px.histogram(df_sc,x="CurrentRSSI",title="Sticky Client Current RSSI",color = "Band")
%%local
#Sticky Client Current RSSI Histogram
px.histogram(df_sc,x="RSSI_diff",title="Sticky Client RSSI Diff between Current and Candidate APs",color = "Candidate_Band")
%%local
#Sticky Client Current RSSI Histogram
df_sc["Band_change"] = df_sc[["Band","Candidate_Band"]].apply(lambda x: "{} to {}".format(x[1],x[0]),axis=1)
df_sc["SameAP"] = df["CurrentAP"]
px.histogram(df_sc,x="RSSI_diff",title="Sticky Client RSSI Diff between Current and Candidate APs",color = "Band_change")
df_sc.withColumn("SameAP",F.col("CurrentAP")==F.col("Candidate_AP")).groupby("SameAP").agg(F.count("WC").alias("event_count")).show()
+------+-----------+ |SameAP|event_count| +------+-----------+ | true| 184945| | false| 4169606| +------+-----------+